1 package org.apache.lucene.search.postingshighlight;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.io.IOException;
21 import java.nio.charset.StandardCharsets;
22 import java.text.BreakIterator;
23 import java.util.ArrayList;
24 import java.util.Arrays;
25 import java.util.Comparator;
26 import java.util.HashMap;
27 import java.util.List;
28 import java.util.Locale;
29 import java.util.Map;
30 import java.util.PriorityQueue;
31 import java.util.SortedSet;
32 import java.util.TreeSet;
33
34 import org.apache.lucene.analysis.Analyzer;
35 import org.apache.lucene.index.FieldInfo;
36 import org.apache.lucene.index.IndexOptions;
37 import org.apache.lucene.index.IndexReader;
38 import org.apache.lucene.index.IndexReaderContext;
39 import org.apache.lucene.index.LeafReader;
40 import org.apache.lucene.index.LeafReaderContext;
41 import org.apache.lucene.index.MultiReader;
42 import org.apache.lucene.index.PostingsEnum;
43 import org.apache.lucene.index.ReaderUtil;
44 import org.apache.lucene.index.StoredFieldVisitor;
45 import org.apache.lucene.index.Term;
46 import org.apache.lucene.index.Terms;
47 import org.apache.lucene.index.TermsEnum;
48 import org.apache.lucene.search.IndexSearcher;
49 import org.apache.lucene.search.Query;
50 import org.apache.lucene.search.ScoreDoc;
51 import org.apache.lucene.search.TopDocs;
52 import org.apache.lucene.util.BytesRef;
53 import org.apache.lucene.util.InPlaceMergeSorter;
54 import org.apache.lucene.util.UnicodeUtil;
55 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97 public class PostingsHighlighter {
98
99
100
101
102
103
  /** Shared searcher over an empty index; used only to rewrite the query and extract its terms. */
  private static final IndexSearcher EMPTY_INDEXSEARCHER;
  static {
    try {
      // A MultiReader with no sub-readers: rewriting against it touches no real index data.
      IndexReader emptyReader = new MultiReader();
      EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader);
      EMPTY_INDEXSEARCHER.setQueryCache(null); // caching is pointless for a rewrite-only searcher
    } catch (IOException bogus) {
      // cannot realistically happen with an empty reader; surface as unchecked if it ever does
      throw new RuntimeException(bogus);
    }
  }
114
115
116
  /** Default maximum content size to process: content beyond this many characters is ignored. */
  public static final int DEFAULT_MAX_LENGTH = 10000;

  // Per-instance cap (in chars) on how much of each stored field value is loaded and highlighted.
  private final int maxLength;

  // Lazily created by getFormatter(String) on first use, unless a subclass overrides it.
  private PassageFormatter defaultFormatter;

  // Lazily created by getScorer(String) on first use, unless a subclass overrides it.
  private PassageScorer defaultScorer;
128
129
130
131
  /** Creates a highlighter that examines at most {@link #DEFAULT_MAX_LENGTH} characters of content. */
  public PostingsHighlighter() {
    this(DEFAULT_MAX_LENGTH);
  }
135
136
137
138
139
140
141 public PostingsHighlighter(int maxLength) {
142 if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
143
144
145 throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
146 }
147 this.maxLength = maxLength;
148 }
149
150
151
152
153
154 protected BreakIterator getBreakIterator(String field) {
155 return BreakIterator.getSentenceInstance(Locale.ROOT);
156 }
157
158
159
160
161
162 protected PassageFormatter getFormatter(String field) {
163 if (defaultFormatter == null) {
164 defaultFormatter = new DefaultPassageFormatter();
165 }
166 return defaultFormatter;
167 }
168
169
170
171
172
173 protected PassageScorer getScorer(String field) {
174 if (defaultScorer == null) {
175 defaultScorer = new PassageScorer();
176 }
177 return defaultScorer;
178 }
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
  /**
   * Highlights the single best passage from the given field for each result document.
   * Convenience overload of {@link #highlight(String, Query, IndexSearcher, TopDocs, int)}
   * with {@code maxPassages == 1}.
   *
   * @param field field name to highlight; must be indexed with offsets (see highlightField)
   * @param query query used to score the passages
   * @param searcher searcher the documents came from
   * @param topDocs documents to highlight
   * @return one formatted snippet per document in {@code topDocs}; an entry is
   *         {@code null} when nothing could be highlighted for that document
   * @throws IOException if reading the index or stored fields fails
   */
  public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
    return highlight(field, query, searcher, topDocs, 1);
  }
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217 public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
218 Map<String,String[]> res = highlightFields(new String[] { field }, query, searcher, topDocs, new int[] { maxPassages });
219 return res.get(field);
220 }
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247 public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
248 int maxPassages[] = new int[fields.length];
249 Arrays.fill(maxPassages, 1);
250 return highlightFields(fields, query, searcher, topDocs, maxPassages);
251 }
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281 public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages[]) throws IOException {
282 final ScoreDoc scoreDocs[] = topDocs.scoreDocs;
283 int docids[] = new int[scoreDocs.length];
284 for (int i = 0; i < docids.length; i++) {
285 docids[i] = scoreDocs[i].doc;
286 }
287
288 return highlightFields(fields, query, searcher, docids, maxPassages);
289 }
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311 public Map<String,String[]> highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
312 Map<String,String[]> snippets = new HashMap<>();
313 for(Map.Entry<String,Object[]> ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) {
314 Object[] snippetObjects = ent.getValue();
315 String[] snippetStrings = new String[snippetObjects.length];
316 snippets.put(ent.getKey(), snippetStrings);
317 for(int i=0;i<snippetObjects.length;i++) {
318 Object snippet = snippetObjects[i];
319 if (snippet != null) {
320 snippetStrings[i] = snippet.toString();
321 }
322 }
323 }
324
325 return snippets;
326 }
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
  /**
   * Expert: highlights the given fields for the given docids, returning the raw objects
   * produced by the {@link PassageFormatter} instead of strings.
   *
   * @param fieldsIn fields to highlight; must be non-empty
   * @param query query used to score passages
   * @param searcher searcher the docids refer to
   * @param docidsIn global docids to highlight
   * @param maxPassagesIn per-field passage limits, parallel to {@code fieldsIn}
   * @return map from field name to per-document results, aligned with {@code docidsIn};
   *         {@code null} entries mean no highlight for that document
   * @throws IllegalArgumentException if {@code fieldsIn} is empty or the array lengths differ
   * @throws IOException if reading the index or stored fields fails
   */
  protected Map<String,Object[]> highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
    if (fieldsIn.length < 1) {
      throw new IllegalArgumentException("fieldsIn must not be empty");
    }
    if (fieldsIn.length != maxPassagesIn.length) {
      throw new IllegalArgumentException("invalid number of maxPassagesIn");
    }
    // Rewrite the query against the shared empty searcher (its query cache is disabled)
    // and collect all its terms, sorted, field-first.
    SortedSet<Term> queryTerms = new TreeSet<>();
    EMPTY_INDEXSEARCHER.createNormalizedWeight(query, false).extractTerms(queryTerms);

    IndexReaderContext readerContext = searcher.getIndexReader().getContext();
    List<LeafReaderContext> leaves = readerContext.leaves();

    // Copy the caller's arrays: we sort them below and must not disturb the originals.
    int[] docids = new int[docidsIn.length];
    System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length);
    final String fields[] = new String[fieldsIn.length];
    System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
    final int maxPassages[] = new int[maxPassagesIn.length];
    System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);

    // Sort docids so highlightField can walk segments/postings forward only, and sort
    // fields (keeping maxPassages aligned via the paired swap) so that
    // LimitedStoredFieldVisitor.needsField can use Arrays.binarySearch.
    Arrays.sort(docids);
    new InPlaceMergeSorter() {

      @Override
      protected void swap(int i, int j) {
        String tmp = fields[i];
        fields[i] = fields[j];
        fields[j] = tmp;
        int tmp2 = maxPassages[i];
        maxPassages[i] = maxPassages[j];
        maxPassages[j] = tmp2;
      }

      @Override
      protected int compare(int i, int j) {
        return fields[i].compareTo(fields[j]);
      }

    }.sort(0, fields.length);

    // Load (and truncate at maxLength) the stored content of every field/doc pair.
    String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);

    Map<String,Object[]> highlights = new HashMap<>();
    for (int i = 0; i < fields.length; i++) {
      String field = fields[i];
      int numPassages = maxPassages[i];
      // View of queryTerms restricted to this field: [ (field, ""), (field, BIG_TERM) ).
      Term floor = new Term(field, "");
      Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
      SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);

      // The TreeSet iterates in sorted order, so this terms array stays sorted.
      BytesRef terms[] = new BytesRef[fieldTerms.size()];
      int termUpto = 0;
      for(Term term : fieldTerms) {
        terms[termUpto++] = term.bytes();
      }
      Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages, query);

      // Map the per-doc results back into the caller's original docid order.
      Object[] result = new Object[docids.length];
      for (int j = 0; j < docidsIn.length; j++) {
        result[j] = fieldHighlights.get(docidsIn[j]);
      }
      highlights.put(field, result);
    }
    return highlights;
  }
420
421
422
423
424
425
426
427 protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
428 String contents[][] = new String[fields.length][docids.length];
429 char valueSeparators[] = new char[fields.length];
430 for (int i = 0; i < fields.length; i++) {
431 valueSeparators[i] = getMultiValuedSeparator(fields[i]);
432 }
433 LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength);
434 for (int i = 0; i < docids.length; i++) {
435 searcher.doc(docids[i], visitor);
436 for (int j = 0; j < fields.length; j++) {
437 contents[j][i] = visitor.getValue(j).toString();
438 }
439 visitor.reset();
440 }
441 return contents;
442 }
443
444
445
446
447
448
449
  /**
   * Returns the character inserted between multiple stored values of the same field when
   * they are concatenated for highlighting. The default is a single space; subclasses
   * may override per field.
   */
  protected char getMultiValuedSeparator(String field) {
    return ' ';
  }
453
454
455
456
457
458
459
  /**
   * Returns the analyzer used to index {@code field}, which enables highlighting of
   * multi-term queries (wildcards, ranges, ...) by re-analyzing the stored content.
   * The default returns {@code null}, which disables multi-term highlighting
   * (see the {@code analyzer != null} check in highlightField).
   */
  protected Analyzer getIndexAnalyzer(String field) {
    return null;
  }
463
  /**
   * Highlights one field across the given docids (which must be sorted), returning a map
   * from global docid to the formatted passage object. Documents with empty content or
   * no highlightable passages are simply absent from the map.
   *
   * @throws IllegalArgumentException if the field was indexed without offsets
   * @throws IOException if reading postings or token streams fails
   */
  private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<LeafReaderContext> leaves, int maxPassages, Query query) throws IOException {
    Map<Integer,Object> highlights = new HashMap<>();

    PassageFormatter fieldFormatter = getFormatter(field);
    if (fieldFormatter == null) {
      throw new NullPointerException("PassageFormatter cannot be null");
    }

    // Multi-term query support: only enabled when a subclass supplies the index analyzer.
    Analyzer analyzer = getIndexAnalyzer(field);
    CharacterRunAutomaton automata[] = new CharacterRunAutomaton[0];
    if (analyzer != null) {
      automata = MultiTermHighlighting.extractAutomata(query, field);
    }

    // Reserve one extra (null) slot at the end of terms for the merged automata
    // postings; highlightDoc treats a null term as "read the term from the payload".
    if (automata.length > 0) {
      BytesRef newTerms[] = new BytesRef[terms.length + 1];
      System.arraycopy(terms, 0, newTerms, 0, terms.length);
      terms = newTerms;
    }

    // Reused across documents of the same leaf: since docids are sorted we only cross
    // into a new leaf forward, and re-open the TermsEnum/postings at that point.
    PostingsEnum postings[] = null;
    TermsEnum termsEnum = null;
    int lastLeaf = -1;

    for (int i = 0; i < docids.length; i++) {
      String content = contents[i];
      if (content.length() == 0) {
        continue; // nothing stored for this doc: nothing to highlight
      }
      bi.setText(content);
      int doc = docids[i];
      int leaf = ReaderUtil.subIndex(doc, leaves);
      LeafReaderContext subContext = leaves.get(leaf);
      LeafReader r = subContext.reader();

      assert leaf >= lastLeaf; // relies on docids being sorted ascending

      if (leaf != lastLeaf) {
        Terms t = r.terms(field);
        if (t != null) {
          if (!t.hasOffsets()) {
            // postings-based highlighting is impossible without offsets in the index
            throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
          }
          termsEnum = t.iterator();
          postings = new PostingsEnum[terms.length];
        } else {
          termsEnum = null;
        }
      }
      if (termsEnum == null) {
        continue; // this segment has no terms for the field
      }

      // Build the synthetic automata postings from a re-analysis of the stored content
      // and position it on this document; it fills the reserved last postings slot.
      if (automata.length > 0) {
        PostingsEnum dp = MultiTermHighlighting.getDocsEnum(analyzer.tokenStream(field, content), automata);
        dp.advance(doc - subContext.docBase);
        postings[terms.length-1] = dp;
      }

      Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);

      if (passages.length == 0) {
        // no term matched: fall back to a default summary of the leading sentences
        passages = getEmptyHighlight(field, bi, maxPassages);
      }

      if (passages.length > 0) {
        highlights.put(doc, fieldFormatter.format(passages, content));
      }

      lastLeaf = leaf;
    }

    return highlights;
  }
546
547
548
549
  /**
   * Core passage construction for a single document: merges the postings of all query
   * terms (plus the optional automata postings in the last slot) in start-offset order,
   * groups consecutive matches into BreakIterator-delimited passages, scores each
   * passage, and returns the best {@code n} sorted by start offset.
   *
   * @param doc segment-local document id
   * @param postings parallel to {@code terms}; an entry may be {@code null} (term not yet
   *        looked up in this segment) or {@code EMPTY} (term known absent)
   * @param n maximum number of passages to return
   */
  private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc,
                                 TermsEnum termsEnum, PostingsEnum[] postings, int n) throws IOException {
    PassageScorer scorer = getScorer(field);
    if (scorer == null) {
      throw new NullPointerException("PassageScorer cannot be null");
    }
    // Min-queue of per-term offset cursors ordered by current start offset.
    PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>();
    float weights[] = new float[terms.length];
    // Seed the queue: seek/advance each term's postings to this doc; cache the result
    // (real postings, or the EMPTY sentinel for "absent") for the next doc in the leaf.
    for (int i = 0; i < terms.length; i++) {
      PostingsEnum de = postings[i];
      int pDoc;
      if (de == EMPTY) {
        continue; // term already known absent from this segment
      } else if (de == null) {
        postings[i] = EMPTY; // tentatively mark absent; overwritten below if found
        if (!termsEnum.seekExact(terms[i])) {
          continue; // term does not exist in this segment at all
        }
        de = postings[i] = termsEnum.postings(null, PostingsEnum.OFFSETS);
        assert de != null;
        pDoc = de.advance(doc);
      } else {
        pDoc = de.docID();
        if (pDoc < doc) {
          pDoc = de.advance(doc);
        }
      }

      if (doc == pDoc) {
        weights[i] = scorer.weight(contentLength, de.freq());
        de.nextPosition(); // position on the first occurrence so offsets are readable
        pq.add(new OffsetsEnum(de, i));
      }
    }

    // Sentinel cursor: EMPTY reports startOffset == Integer.MAX_VALUE, which guarantees
    // the "start >= contentLength" branch below eventually fires and flushes results.
    pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE));

    // Bounded best-n queue with the worst passage on top, so it can be evicted cheaply.
    PriorityQueue<Passage> passageQueue = new PriorityQueue<>(n, new Comparator<Passage>() {
      @Override
      public int compare(Passage left, Passage right) {
        if (left.score < right.score) {
          return -1;
        } else if (left.score > right.score) {
          return 1;
        } else {
          return left.startOffset - right.startOffset;
        }
      }
    });
    Passage current = new Passage();

    OffsetsEnum off;
    while ((off = pq.poll()) != null) {
      final PostingsEnum dp = off.dp;
      int start = dp.startOffset();
      assert start >= 0;
      int end = dp.endOffset();
      // Skip matches that straddle the truncation boundary: the match began inside the
      // loaded content but ends past it, so it cannot be rendered completely.
      assert EMPTY.startOffset() == Integer.MAX_VALUE;
      if (start < contentLength && end > contentLength) {
        continue;
      }
      if (start >= current.endOffset) {
        // The new match falls outside the current passage: finalize the passage first.
        if (current.startOffset >= 0) {
          // Apply the position-based normalization to the accumulated score.
          current.score *= scorer.norm(current.startOffset);
          // New passage is too weak to displace anything? then recycle the object.
          if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
            current.reset();
          } else {
            passageQueue.offer(current);
            if (passageQueue.size() > n) {
              current = passageQueue.poll(); // evict the worst and reuse its object
              current.reset();
            } else {
              current = new Passage();
            }
          }
        }
        // Sentinel reached (or match beyond loaded content): emit the collected passages.
        if (start >= contentLength) {
          Passage passages[] = new Passage[passageQueue.size()];
          passageQueue.toArray(passages);
          for (Passage p : passages) {
            p.sort(); // order each passage's matches by offset
          }
          // Present passages in document order, not score order.
          Arrays.sort(passages, new Comparator<Passage>() {
            @Override
            public int compare(Passage left, Passage right) {
              return left.startOffset - right.startOffset;
            }
          });
          return passages;
        }
        // Open a new passage spanning the break-iterator unit containing this match.
        assert BreakIterator.DONE < 0;
        current.startOffset = Math.max(bi.preceding(start+1), 0);
        current.endOffset = Math.min(bi.next(), contentLength);
      }
      // Consume every occurrence of this term that falls inside the current passage.
      int tf = 0;
      while (true) {
        tf++;
        BytesRef term = terms[off.id];
        if (term == null) {
          // Automata slot (multi-term highlighting): the matched term rides the payload.
          term = off.dp.getPayload();
          assert term != null;
        }
        current.addMatch(start, end, term);
        if (off.pos == dp.freq()) {
          break; // all positions of this term in this doc are consumed
        } else {
          off.pos++;
          dp.nextPosition();
          start = dp.startOffset();
          end = dp.endOffset();
        }
        if (start >= current.endOffset || end > contentLength) {
          pq.offer(off); // next occurrence belongs to a later passage: requeue
          break;
        }
      }
      current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
    }

    // Unreachable: the sentinel guarantees the return inside the loop is taken.
    assert false;
    return null;
  }
683
684
685
686
687
688 protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
689
690 List<Passage> passages = new ArrayList<>();
691 int pos = bi.current();
692 assert pos == 0;
693 while (passages.size() < maxPassages) {
694 int next = bi.next();
695 if (next == BreakIterator.DONE) {
696 break;
697 }
698 Passage passage = new Passage();
699 passage.score = Float.NaN;
700 passage.startOffset = pos;
701 passage.endOffset = next;
702 passages.add(passage);
703 pos = next;
704 }
705
706 return passages.toArray(new Passage[passages.size()]);
707 }
708
709 private static class OffsetsEnum implements Comparable<OffsetsEnum> {
710 PostingsEnum dp;
711 int pos;
712 int id;
713
714 OffsetsEnum(PostingsEnum dp, int id) throws IOException {
715 this.dp = dp;
716 this.id = id;
717 this.pos = 1;
718 }
719
720 @Override
721 public int compareTo(OffsetsEnum other) {
722 try {
723 int off = dp.startOffset();
724 int otherOff = other.dp.startOffset();
725 if (off == otherOff) {
726 return id - other.id;
727 } else {
728 return Integer.compare(off, otherOff);
729 }
730 } catch (IOException e) {
731 throw new RuntimeException(e);
732 }
733 }
734 }
735
  /**
   * Sentinel postings with two roles in highlightDoc: (a) cached in the postings array
   * to mark a term absent from the current segment, and (b) enqueued as a terminator —
   * its offsets are Integer.MAX_VALUE, beyond any real content offset, which forces the
   * "start >= contentLength" branch to flush and return the collected passages.
   */
  private static final PostingsEnum EMPTY = new PostingsEnum() {

    @Override
    public int nextPosition() throws IOException { return -1; }

    @Override
    public int startOffset() throws IOException { return Integer.MAX_VALUE; }

    @Override
    public int endOffset() throws IOException { return Integer.MAX_VALUE; }

    @Override
    public BytesRef getPayload() throws IOException { return null; }

    @Override
    public int freq() throws IOException { return 0; }

    @Override
    public int docID() { return NO_MORE_DOCS; }

    @Override
    public int nextDoc() throws IOException { return NO_MORE_DOCS; }

    @Override
    public int advance(int target) throws IOException { return NO_MORE_DOCS; }

    @Override
    public long cost() { return 0; }
  };
765
  /**
   * StoredFieldVisitor that accumulates the string values of a fixed, sorted set of
   * fields, joining multiple values of one field with a per-field separator and
   * truncating each accumulated value at {@code maxLength} characters.
   */
  private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
    private final String fields[];          // requested field names, sorted (binarySearch below)
    private final char valueSeparators[];   // per-field separator, parallel to fields
    private final int maxLength;            // cap on accumulated chars per field
    private final StringBuilder builders[]; // accumulated value per field, parallel to fields
    private int currentField = -1;          // index of the field being visited; -1 between fields

    public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) {
      assert fields.length == valueSeparators.length;
      this.fields = fields;
      this.valueSeparators = valueSeparators;
      this.maxLength = maxLength;
      builders = new StringBuilder[fields.length];
      for (int i = 0; i < builders.length; i++) {
        builders[i] = new StringBuilder();
      }
    }

    @Override
    public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
      String value = new String(bytes, StandardCharsets.UTF_8);
      assert currentField >= 0; // needsField must have accepted this field first
      StringBuilder builder = builders[currentField];
      // Separate from a previously appended value, unless already filled to maxLength.
      if (builder.length() > 0 && builder.length() < maxLength) {
        builder.append(valueSeparators[currentField]);
      }
      // Truncate so the accumulated value never exceeds maxLength characters.
      if (builder.length() + value.length() > maxLength) {
        builder.append(value, 0, maxLength - builder.length());
      } else {
        builder.append(value);
      }
    }

    @Override
    public Status needsField(FieldInfo fieldInfo) throws IOException {
      // fields[] is sorted by the caller, so binarySearch locates the slot (or < 0).
      currentField = Arrays.binarySearch(fields, fieldInfo.name);
      if (currentField < 0) {
        return Status.NO; // not one of the requested fields
      } else if (builders[currentField].length() > maxLength) {
        // Already past the limit; if it was the only field, stop visiting entirely.
        return fields.length == 1 ? Status.STOP : Status.NO;
      }
      return Status.YES;
    }

    // Returns the accumulated (possibly truncated) value for field index i.
    String getValue(int i) {
      return builders[i].toString();
    }

    // Clears all accumulated values; call between documents.
    void reset() {
      currentField = -1;
      for (int i = 0; i < fields.length; i++) {
        builders[i].setLength(0);
      }
    }
  }
821 }